Load merged Summarized Experiment data and pre-processing
se=readRDS(file = paste0(MEDIAsave,"Mydata_EGAD00001001331_merged.rds"))
colData(se)$Donor=factor(colData(se)$Donor)
colData(se)$Status=factor(colData(se)$Status)
keep <- rowSums(assay(se)>=15) >= 6 #at least 50% of the patients
se1 <- se[keep,]
tmp <-assay(se1)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <- merge(tmp,ens2gene19,by.y="gene_id")
Fix duplicates: mean expression value across transcripts has been
used for each duplicate gene
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)
tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]
counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,26)]
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp$gene_name %in% x,-c(1,26)]
counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))
dataDds <-tmp3
ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])
dds <- DESeqDataSetFromMatrix(countData = ccounts,
colData = colData(se),
design = ~ Donor + Status)
dds$Status <- factor(dds$Status, levels = c("Normal","Osteoarthritis"))
dds$Status <- relevel(dds$Status, ref = "Normal")
keep <- rowSums(counts(dds)>=15) >= 6 #re-check low counts genes, 0 in this case
sum(keep) #19124
## [1] 19124
cat("Merged data Dimension after filtering and pre-processing: ", dim(dds))
## Merged data Dimension after filtering and pre-processing: 19124 24
Principal component analysis
vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Batch correction for paired
mm <- model.matrix(~Status, colData(vsd))
assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Donor, design=mm)
pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Differential expression analysis using DESeq2
dds <- DESeq(dds)
resultsNames(dds)
## [1] "Intercept" "Donor_SMBP051_vs_SMBP049"
## [3] "Donor_SMBP053_vs_SMBP049" "Donor_SMBP054_vs_SMBP049"
## [5] "Donor_SMBP055_vs_SMBP049" "Donor_SMBP056_vs_SMBP049"
## [7] "Donor_SMBP059_vs_SMBP049" "Donor_SMBP060_vs_SMBP049"
## [9] "Donor_SMBP061_vs_SMBP049" "Donor_SMBP064_vs_SMBP049"
## [11] "Donor_SMBP065_vs_SMBP049" "Donor_SMBP066_vs_SMBP049"
## [13] "Status_Osteoarthritis_vs_Normal"
df.res <- as.data.frame(results(dds,name="Status_Osteoarthritis_vs_Normal" ,pAdjustMethod = "fdr"))
summary(df.res)
## baseMean log2FoldChange lfcSE stat
## Min. : 7 Min. :-2.052493 Min. :0.03297 Min. :-5.82675
## 1st Qu.: 62 1st Qu.:-0.116090 1st Qu.:0.10228 1st Qu.:-0.90492
## Median : 318 Median : 0.000866 Median :0.13994 Median : 0.00505
## Mean : 2015 Mean : 0.037667 Mean :0.16481 Mean : 0.07884
## 3rd Qu.: 1122 3rd Qu.: 0.142020 3rd Qu.:0.20596 3rd Qu.: 0.98097
## Max. :3442514 Max. : 2.257059 Max. :0.97056 Max. : 6.86018
## pvalue padj
## Min. :0.0000 Min. :0.0000001
## 1st Qu.:0.1191 1st Qu.:0.4759139
## Median :0.3460 Median :0.6916913
## Mean :0.3981 Mean :0.6523361
## 3rd Qu.:0.6530 3rd Qu.:0.8705838
## Max. :0.9999 Max. :0.9999275
df.res$ens <-rownames(df.res)
dataDds <-dataDds[,c(1,26)]
colnames(dataDds) <-c("ens","gene_id")
df.res<-merge(dataDds,df.res,by.x="ens")
df.res <-df.res[,c(1,3:8,2)]
colnames(df.res)[8] <-"hgnc_symbol"
rownames(df.res) <-df.res$ens
rm(dataDds,tmp3)
Save files
save(df.res, file=paste0(MEDIAsave,"results_DE_EGApaired.RData"))
Volcano Plot of DE genes
annot <-df.res[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange >= 1 & annot$padj <=0.05, "col"] <- "red"
annot$label <- NA
annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "hgnc_symbol"]
ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
geom_point(size=1) +
geom_label_repel(size=4,
min.segment.length = 0,direction = "both",
max.overlaps =40,
color="black",
segment.linetype=2) +
scale_shape_manual(values=c(8, 19))+
scale_color_manual(values = c("red","gray50","springgreen3"),
breaks = c("red","gray50","springgreen3"),
labels = c("UP","notDE","DOWN")) +
labs(color="Legend")+
xlab("log2(FC)")+
ylab("-log10(FDR)")+
theme_bw()+
ggtitle("Volcano plot of DE genes - Dataset 2")+
geom_vline(xintercept=c(-1, 1), col="gray", linetype=2)+
geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME
hs=get("org.Hs.eg.db")
genes1 <- df.res[, c("log2FoldChange","hgnc_symbol")]
genes <- genes1$log2FoldChange
names(genes) <-genes1$hgnc_symbol
genes <-sort(genes, decreasing = TRUE)
gsebp <- gseGO(geneList=genes,
ont ="BP",
keyType = "SYMBOL",
pvalueCutoff = 0.01,
eps = 0,
verbose = TRUE,
OrgDb = hs,
pAdjustMethod = "fdr")
dotplot(gsebp, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)
gsreac <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
eps = 0,
verbose = TRUE,
minGSSize =5, pAdjustMethod = "fdr",
pvalueCutoff =0.1)
dotplot(gsreac, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
theme(axis.text.y = element_text(size = 7))

Load data and pre-processing
load(paste0(wdd,"data/txi.RData"))
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
txi$counts <- txi$counts[,match(unique(as.character(patientDetails$name)),colnames(txi$counts))]
txi$abundance <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))]
txi$length <- txi$length[,match(unique(as.character(patientDetails$name)),colnames(txi$length))]
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")
Batches<-as.factor(patientDetails$batch)
Disease<- as.factor(patientDetails$OA)
colData<-data.frame(Batches=Batches,Disease)
dds<-DESeqDataSetFromTximport(txi,colData,~Batches+Disease)
filter <- rowSums(counts(dds)>=5) >= 35 #at least 50% of the samples
dds <- dds[filter,]
tmp <-counts(dds)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <-merge(tmp,ens2gene,by.y="gene_id")
Fix duplicates: mean expression value across transcripts has been
used for each duplicate gene
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)
tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]
counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,72)]
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp$gene_name %in% x,-c(1,72)]
counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))
ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])
dds0 <- DESeqDataSetFromMatrix(countData = ccounts,
colData = colData,
design = ~Batches+Disease)
filter <- rowSums(counts(dds0)>=5) >= 35 #re-check low counts genes
dds0 <- dds0[filter,]
Principal component analysis
vsd <- vst(dds0, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Batch correction
mm <- model.matrix(~Disease, colData(vsd))
assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Batches, design = mm)
pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Differential expression analysis using DESeq2
dds2<-DESeq(dds0)
resultsNames(dds2)
## [1] "Intercept" "Batches_2_vs_1" "Batches_3_vs_1"
## [4] "Batches_4_vs_1" "Batches_5_vs_1" "Batches_6_vs_1"
## [7] "Batches_7_vs_1" "Batches_8_vs_1" "Batches_9_vs_1"
## [10] "Batches_10_vs_1" "Batches_11_vs_1" "Batches_12_vs_1"
## [13] "Batches_13_vs_1" "Disease_OA_vs_NonOA"
OAvsnonOA<-as.data.frame(results(dds2,name="Disease_OA_vs_NonOA",pAdjustMethod = "fdr"))
OAvsnonOA_dup <-OAvsnonOA[grep("ENS_",rownames(OAvsnonOA)),]
names(gn_dup) <-counts2$gene_id
setdiff(names(gn_dup),rownames(OAvsnonOA_dup)) ## check if some corrected duplicates did not exceed the minimum of counts for at least half of the patients
## [1] "ENS_202"
#[1] "ENS_202"
which(names(gn_dup) %in% "ENS_202")
## [1] 202
# 202
OAvsnonOA_dup <-OAvsnonOA_dup[names(gn_dup[-202]),]
OAvsnonOA_dup$hgnc_symbol <-gn_dup[-202]
OAvsnonOA_dup <-cbind(Row.names=rownames(OAvsnonOA_dup), OAvsnonOA_dup)
OAvsnonOA<-merge(OAvsnonOA,ens2gene,by.x="row.names",by.y="gene_id")
colnames(OAvsnonOA)[8] <-"hgnc_symbol"
OAvsnonOA_dup <-OAvsnonOA_dup[,colnames(OAvsnonOA)]
OAvsnonOA <-rbind(OAvsnonOA, OAvsnonOA_dup)
OAvsnonOA<-OAvsnonOA[order(OAvsnonOA$log2FoldChange,decreasing=TRUE),]
Save files
OAvsnonOA <-OAvsnonOA[complete.cases(OAvsnonOA),]
save(OAvsnonOA, dds0, OAvsnonOA_dup, gn_dup,file=paste0(MEDIAsave,"genide-bulk-unpaired.RData"))
save(OAvsnonOA,file=paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"))
Volcano Plot of DE genes
annot <-OAvsnonOA[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "col"] <- "red"
annot$label <- NA
annot[annot$log2FoldChange >= 3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange >= 3 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange <= -3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange <= -3 & annot$padj <=0.05, "hgnc_symbol"]
ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
geom_point(size=1) +
geom_label_repel(size=4,
min.segment.length = 0,direction = "both",
max.overlaps =40,
color="black",
segment.linetype=2) +
scale_shape_manual(values=c(8, 19))+
scale_color_manual(values = c("red","gray50","springgreen3"),
breaks = c("red","gray50","springgreen3"),
labels = c("UP","notDE","DOWN")) +
labs(color="Legend")+
xlab("log2(FC)")+
ylab("-log10(FDR)")+
ggtitle("Volcano plot of DE genes - Dataset 3")+
theme_bw()+
geom_vline(xintercept=c(-2.5, 2.5), col="gray", linetype=2)+
geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME
gene <-OAvsnonOA[order(OAvsnonOA$log2FoldChange, decreasing = TRUE),]
genes <-gene$log2FoldChange
names(genes) <-gene$hgnc_symbol
hs=get("org.Hs.eg.db")
gsebp2 <- gseGO(geneList=genes,
ont ="BP",
keyType = "SYMBOL",
pvalueCutoff = 0.01,
eps = 0,
verbose = TRUE,
OrgDb = hs,
pAdjustMethod = "fdr")
dotplot(gsebp2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)
gsreac2 <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
eps = 0,
verbose = TRUE,
minGSSize =5, pAdjustMethod = "fdr",
pvalueCutoff =0.1)
dotplot(gsreac2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
theme(axis.text.y = element_text(size = 7))

Session Info
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] msigdbr_7.5.1 gprofiler2_0.2.2
## [3] enrichplot_1.21.0 clusterProfiler_4.4.4
## [5] org.Hs.eg.db_3.15.0 ggrepel_0.9.3
## [7] ggplot2_3.4.3 EDASeq_2.30.0
## [9] ShortRead_1.54.0 GenomicAlignments_1.32.1
## [11] Rsamtools_2.12.0 Biostrings_2.64.1
## [13] XVector_0.36.0 BiocParallel_1.30.4
## [15] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [17] MatrixGenerics_1.8.1 matrixStats_1.0.0
## [19] readr_2.1.4 limma_3.52.4
## [21] EnsDb.Hsapiens.v75_2.99.0 EnsDb.Hsapiens.v79_2.99.0
## [23] ensembldb_2.20.2 AnnotationFilter_1.20.0
## [25] GenomicFeatures_1.48.4 AnnotationDbi_1.58.0
## [27] Biobase_2.56.0 GenomicRanges_1.48.0
## [29] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [31] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [33] dplyr_1.1.3
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.1.2 fastmatch_1.1-4 aroma.light_3.26.0
## [4] BiocFileCache_2.4.0 plyr_1.8.8 igraph_1.5.0
## [7] lazyeval_0.2.2 splines_4.2.1 digest_0.6.32
## [10] yulab.utils_0.0.6 htmltools_0.5.6 GOSemSim_2.22.0
## [13] viridis_0.6.3 GO.db_3.15.0 fansi_1.0.4
## [16] magrittr_2.0.3 memoise_2.0.1 tzdb_0.4.0
## [19] annotate_1.74.0 graphlayouts_1.0.0 R.utils_2.12.2
## [22] prettyunits_1.1.1 jpeg_0.1-10 colorspace_2.1-0
## [25] blob_1.2.4 rappdirs_0.3.3 xfun_0.39
## [28] crayon_1.5.2 RCurl_1.98-1.12 jsonlite_1.8.7
## [31] scatterpie_0.2.1 genefilter_1.78.0 ape_5.7-1
## [34] survival_3.5-5 glue_1.6.2 polyclip_1.10-4
## [37] gtable_0.3.4 zlibbioc_1.42.0 DelayedArray_0.22.0
## [40] scales_1.2.1 DOSE_3.22.1 DBI_1.1.3
## [43] Rcpp_1.0.11 viridisLite_0.4.2 xtable_1.8-4
## [46] progress_1.2.2 tidytree_0.4.2 gridGraphics_0.5-1
## [49] bit_4.0.5 htmlwidgets_1.6.2 httr_1.4.7
## [52] fgsea_1.22.0 RColorBrewer_1.1-3 pkgconfig_2.0.3
## [55] XML_3.99-0.14 R.methodsS3_1.8.2 farver_2.1.1
## [58] sass_0.4.7 dbplyr_2.3.3 deldir_1.0-9
## [61] locfit_1.5-9.8 utf8_1.2.3 labeling_0.4.3
## [64] ggplotify_0.1.0 tidyselect_1.2.0 rlang_1.1.1
## [67] reshape2_1.4.4 munsell_0.5.0 tools_4.2.1
## [70] cachem_1.0.8 downloader_0.4 cli_3.6.1
## [73] generics_0.1.3 RSQLite_2.3.1 evaluate_0.21
## [76] stringr_1.5.0 fastmap_1.1.1 yaml_2.3.7
## [79] ggtree_3.6.2 babelgene_22.9 knitr_1.43
## [82] bit64_4.0.5 tidygraph_1.2.3 purrr_1.0.1
## [85] KEGGREST_1.36.3 ggraph_2.1.0 nlme_3.1-162
## [88] R.oo_1.25.0 aplot_0.1.10 DO.db_2.9
## [91] xml2_1.3.4 biomaRt_2.52.0 compiler_4.2.1
## [94] rstudioapi_0.15.0 plotly_4.10.2 filelock_1.0.2
## [97] curl_5.0.1 png_0.1-8 treeio_1.20.2
## [100] tibble_3.2.1 tweenr_2.0.2 geneplotter_1.74.0
## [103] bslib_0.5.1 stringi_1.7.12 highr_0.10
## [106] lattice_0.21-8 ProtGenerics_1.28.0 Matrix_1.5-4.1
## [109] vctrs_0.6.3 pillar_1.9.0 lifecycle_1.0.3
## [112] jquerylib_0.1.4 data.table_1.14.8 bitops_1.0-7
## [115] patchwork_1.1.3 rtracklayer_1.56.1 qvalue_2.28.0
## [118] R6_2.5.1 BiocIO_1.6.0 latticeExtra_0.6-30
## [121] hwriter_1.3.2.1 gridExtra_2.3 codetools_0.2-19
## [124] MASS_7.3-58.3 rjson_0.2.21 withr_2.5.0
## [127] GenomeInfoDbData_1.2.8 parallel_4.2.1 hms_1.1.3
## [130] ggfun_0.1.0 grid_4.2.1 tidyr_1.3.0
## [133] rmarkdown_2.24 ggforce_0.4.1 interp_1.1-4
## [136] restfulr_0.0.15